# ==============================================================================
# DESCRIPTION ----
# ==============================================================================
# Last updated 5 August, 2025 by Trevor Incerti

# This file contains user-defined functions used throughout code 
# cleaning and analysis

# ==============================================================================
# FUNCTION TO IMPORT AND APPEND ALL FILES FROM A DIRECTORY ----
# ==============================================================================

# Import/define pipe operator from magrittr ------------------------------------
`%>%` <- magrittr::`%>%`

# Helper functions -------------------------------------------------------------
read_flnm <- function(flnm, delim = NULL, skip = NULL) {
  read_delim(flnm, delim = delim, skip = skip, 
             col_types = cols(.default = "c")) %>% 
    mutate(filename = tools::file_path_sans_ext(fs::path_file(flnm)))
}

read_flnm_xl <- function(flnm, sheet = NULL, skip = NULL, col_types = NULL) {
  readxl::read_excel(flnm, sheet = sheet, skip = skip, col_types = col_types) %>% 
    mutate(filename = tools::file_path_sans_ext(fs::path_file(flnm)))
}

# Main function: read in and append all files in a directory ------------------ 
# Function arguments:
# Path = filepath of directory where data files are located.
# Extension = data files extension. Currently accepts:
# all extensions compatible with readr::read_delim and "xlsx" for Excel.
# delim = Single character used to separate fields within a record, e.g. ",".
# sheet = Sheet to import if importing from Excel. 
# skip = Number of rows to skip when importing each file.

read_dir = function(path, extension, delim, filename, sheet = NULL, skip = 0,
                    col_types = NULL) {
  
  # Stop and display errors if conflicting arguments are entered
  if (!missing(sheet) & extension != "xlsx") {
    stop("Error: Argument 'sheet' only applies to Excel files")
    
    # Read in delimited text data files
  } else if (filename == FALSE & extension != "xlsx") {
    list.files(path = path,
               pattern = paste0("*.", extension),
               full.names = T) %>%
      purrr::map_df(~read_delim(., delim = delim, skip = skip, 
                                col_types = cols(.default = "c")))
    
  } else if (filename == TRUE & extension != "xlsx") {
    list.files(path = path,
               pattern = paste0("*.", extension),
               full.names = T) %>%
      purrr::map_df(~read_flnm(., delim = delim, skip = skip))
    
    # Read in Excel data files  
  } else if (extension == "xlsx" & filename == F) {
    list.files(path = path,
               pattern = paste0("*.", extension),
               full.names = T) %>%
      purrr::map_df(~readxl::read_excel(., sheet = sheet, skip = skip,
                                        col_types = col_types))
    
  } else if (extension == "xlsx" & filename == T) {
    list.files(path = path,
               pattern = paste0("*.", extension),
               full.names = T) %>%
      purrr::map_df(~read_flnm_xl(., sheet = sheet, skip = skip,
                                  col_types = col_types))
  }
}

# ==============================================================================
# FUNCTIONS FOR CONVERSION OF JAPANESE DATA FORMATTING ----
# ==============================================================================

#### Function to convert from full-width to half-width characters ####
zen2han <- function(s){
    if(any(Encoding(s) != "UTF-8"))  s <- iconv(s, from = "", to = "UTF-8")
    s <- paste(s, sep='', collapse='')
    y <- sapply(unlist(strsplit(s, split = "")), function(x){
        i <- utf8ToInt(x)
        if(i >= 65281 && i <= 65374){
            return(intToUtf8(i - 65248))
        }else{
            return(x)
        }
    })
    return(paste(y, collapse = ""))
}

sanitizeZenkaku <-function(s){
  stopifnot(is.character(s))
  if(Encoding(s[1]) != "UTF-8")  s <- iconv(s, from = "", to = "UTF-8")
  zenEisu <- paste0(intToUtf8(65295 + 1:10), intToUtf8(65312 + 1:26),
                    intToUtf8(65344 + 1:26))
  zenKigo <- c(65281, 65283, 65284, 65285, 65286, 65290, 65291,
               65292, 12540, 65294, 65295, 65306, 65307, 65308,
               65309, 65310, 65311, 65312, 65342, 65343, 65372,
               65374)
  s <- chartr(zenEisu,"0-9A-Za-z", s)
  s <- chartr(intToUtf8(zenKigo), '!#$%&*+,-./:;<=>?@^_|~', s)
  s <- gsub(intToUtf8(12288), "", s)
  return(s)
}

#### Function to convert Kanji to Romaji ####
kakasi <- function(x, kakasi.option="-Ha -Ka -Ja -Ea -ka",
                   ITAIJIDICTPATH=Sys.getenv("ITAIJIDICTPATH", unset = NA),
                   KANWADICTPATH=Sys.getenv("KANWADICTPATH", unset = NA)){
  if(is.na(ITAIJIDICTPATH)){
    Sys.setenv(ITAIJIDICTPATH=.set.dict("itaijidict"))
  }else{
    stopifnot(file.exists(ITAIJIDICTPATH))
    Sys.setenv(ITAIJIDICTPATH=ITAIJIDICTPATH)
  }
  if(is.na(KANWADICTPATH)){
    Sys.setenv(KANWADICTPATH=.set.dict("kanwadict"))
  }else{
    stopifnot(file.exists(KANWADICTPATH))
    Sys.setenv(KANWADICTPATH=KANWADICTPATH)
  }
  stopifnot(is.character(x))
  ops <- strsplit(kakasi.option, " ")[[1]]
  ## This is a trick to work correctly
  ops <- c(" ", ops)
  stopifnot(length(ops) != 0)
  lc.ctype <- Sys.getlocale("LC_CTYPE")
  if (lc.ctype != "ja_JP.UTF-8" || lc.ctype != ""){
    if (Sys.getlocale("LC_CTYPE") == "Japanese_Japan.932") x <- sjis2utf8(x)
  }else{
    warning("kakasi() assumes \"ja_JP.UTF-8\" for LC_TYPE")
  }
  u <- sapply(x, function(i){
    .Call("rkakasi", x = i, k = ops, PACKAGE = "Nippon")
  })
  return(u)
}

# ==============================================================================
# COALESCE JOIN FUNCTION (A LA SQL) ----
# ==============================================================================

coalesce_join <- function(x, y, 
                          by = NULL, suffix = c(".x", ".y"), 
                          join = dplyr::left_join, ...) {
  joined <- join(x, y, by = by, suffix = suffix, ...)
  # names of desired output
  cols <- union(names(x), names(y))
  
  to_coalesce <- names(joined)[!names(joined) %in% cols]
  suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
  # remove suffixes and deduplicate
  to_coalesce <- unique(substr(
    to_coalesce, 
    1, 
    nchar(to_coalesce) - nchar(suffix_used)
  ))
  
  coalesced <- purrr::map_dfc(to_coalesce, ~dplyr::coalesce(
    joined[[paste0(.x, suffix[1])]], 
    joined[[paste0(.x, suffix[2])]]
  ))
  names(coalesced) <- to_coalesce
  
  dplyr::bind_cols(joined, coalesced)[cols]
}

# ==============================================================================
# NIKKEI DATA COMBINING AND CLEANING FUNCTION ----
# ==============================================================================
# Create function to read in and append Nikkei NEEDS data
read_plus <- function(flnm, delim) {
  readxl::read_excel(flnm, sheet = "Sheet1", skip = 1) %>% 
    slice(-1) %>% 
    mutate(filename = tools::file_path_sans_ext(fs::path_file(flnm))) 
}

read_nikkei = function(path, extension, filename) {
  
  if (filename == TRUE) {
    list.files(path = path,
               pattern = paste0("*.", extension),
               full.names = T) %>%
      map_df(~read_plus(.))
    
  } else {
    
    list.files(path = path,
               pattern = paste0("*.", extension),
               full.names = T) %>%
      map_df(~read_delim(.))
  }
}

# ==============================================================================
# WRAPPER FUNCTION FOR PANELMATCH ----
# ==============================================================================
# Makes running multiple panelmatch models less repetitive
run_panelmatch <- function(data, 
                                 treatment, 
                                 outcome, 
                                 lag = 1, 
                                 lead = 0:5, 
                                 refinement.method = "mahalanobis", 
                                 covariates = ~ total_assets + total_liabilities + employees + ebitda + gross_profit + operating_revenue,
                                 unit.id = "nikkei_code", 
                                 time.id = "year",
                                 number.iterations = 1000,
                                 match.missing = TRUE,
                                 forbid.treatment.reversal = TRUE,
                                 placebo.test = TRUE) {
  
  # Inner function to clean the result table
  clean_loan_result_table <- function(df) {
    df %>%
      mutate(across(estimate:`97.5%`, ~ round(., 2))) %>%
      select(
        `Time window` = time_window,
        Estimate = estimate,
        SE = std.error,
        `95% CI lower` = `2.5%`,
        `95% CI upper` = `97.5%`
      )
  }
  
  # Step 1: Create PanelData object
  panel <- PanelData(panel.data = data,
                     unit.id = unit.id,
                     time.id = time.id,
                     treatment = treatment,
                     outcome = outcome)
  
  # Step 2: Run PanelMatch
  set.seed(999)
  matched <- PanelMatch(panel.data = panel, 
                        lag = lag,
                        lead = lead,
                        refinement.method = refinement.method,
                        match.missing = match.missing,
                        covs.formula = covariates,
                        qoi = "att",
                        forbid.treatment.reversal = forbid.treatment.reversal,
                        placebo.test = placebo.test)
  
  # Step 3: Estimate results
  estimated <- PanelEstimate(sets = matched, 
                             panel.data = panel, 
                             number.iterations = number.iterations)
  
  # Step 4: Format result tables
  raw_table <- as.data.frame(summary(estimated))
  raw_table <- cbind(time_window = rownames(raw_table), raw_table)
  rownames(raw_table) <- NULL
  
  clean_table <- clean_loan_result_table(raw_table)
  
  # Step 5: Return a list with both tables (and optionally the objects too)
  return(list(
    cleaned_result_table = clean_table,
    raw_result_table = raw_table,
    estimate_object = estimated,
    match_object = matched,
    panel_object = panel
  ))
}

# Placebo tests ----------------------------------------------------------------
get_coef_placebo <- function(placebo_obj, label) {
  est <- placebo_obj$estimates[1]
  se <- placebo_obj$standard.errors[1]
  data.frame(
    group = label,
    estimate = est,
    se = se,
    ci_lower = est - 1.96 * se,
    ci_upper = est + 1.96 * se
  )
}

# ==============================================================================
# Plot designs ----
# ==============================================================================
# Matching ---------------------------------------------------------------------
# TSCS Matching
gglayers = list(
  geom_hline(aes(yintercept = 0), linetype = 2, color = "grey"),
  geom_point(color = "steelblue2", size = 2),
  geom_errorbar(aes(ymin = summary.2.5., ymax = summary.97.5.), 
                color = "grey75", width = 0.1),
  xlab("Year(s) since hire"),
  ylab("Estimated effect of hire on loan volume (million yen)"),
  scale_y_continuous(limits = c(-5000, 10000), 
                     breaks= seq(-5000, 10000, 2000)),
  theme_classic(),
  theme(axis.title = element_text(size = 14)),
  theme(axis.text=element_text(size = 14)),
  theme(axis.text.x = element_text(size = 14))
)

# TSCS Matching Lead Robustness
gglayers_leads = list(
  aes(x = `Time window`, y = Estimate),
  geom_hline(aes(yintercept = 0), linetype = 2, color = "grey"),
  geom_point(color = "steelblue2", size = 2),
  geom_errorbar(aes(ymin = `95% CI lower`, ymax = `95% CI upper`), 
                color = "grey75", width = 0.1),
  xlab("Year(s) since hire"),
  ylab("Fffect of hire on loan volume"),
  scale_y_continuous(limits = c(-10000, 15000), 
                     breaks= seq(-10000, 15000, 5000)),
  theme_classic(),
  theme(axis.title = element_text(size = 14)),
  theme(axis.text=element_text(size = 14)),
  theme(axis.text.x = element_text(size = 14))
)

# TSCS Matching 10% confidence
gglayers_10 = list(
  geom_hline(aes(yintercept = 0), linetype = 2, color = "grey"),
  geom_point(color = "steelblue2", size = 2),
  geom_errorbar(aes(ymin = summary.5., ymax = summary.95.), 
                color = "grey75", width = 0.1),
  xlab("Year(s) since hire"),
  ylab("Estimated effect of hire on loan volume (billion yen)"),
  scale_y_continuous(limits = c(-5000, 10000), 
                     breaks= seq(-5000, 10000, 2000)),
  theme_classic(),
  theme(axis.title = element_text(size = 14)),
  theme(axis.text=element_text(size = 14)),
  theme(axis.text.x = element_text(size = 14))
)

# TCSC Matching Update new panelmatch version 2025 
gglayers_update = list (
  aes(x = `Time window`, y = Estimate),
  geom_hline(aes(yintercept = 0), linetype = 2, color = "grey"),
  geom_point(color = "steelblue2", size = 2),
  geom_errorbar(aes(ymin = `95% CI lower`, ymax = `95% CI upper`),
                color = "grey75", width = 0.1),
  xlab("Year(s) since hire"),
  ylab("Estimated effect of hire on loan volume (million yen)"),
  theme_classic(),
  theme(axis.title = element_text(size = 14)),
  theme(axis.text=element_text(size = 14)),
  theme(axis.text.x = element_text(size = 14))
)
# ==============================================================================
# TABLE DESIGNS AND EXPORT ----
# ==============================================================================
clean_loan_result_table <- function(df) {
  df %>%
    mutate(across(estimate:`97.5%`, ~ round(., 2))) %>%
    select(
      `Time window` = time_window,
      Estimate = estimate,
      SE = std.error,
      `95% CI lower` = `2.5%`,
      `95% CI upper` = `97.5%`
    )
}
